import praw
from tqdm import tqdm
import re
from collections import Counter
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords
import numpy as np
from praw.models.reddit.comment import Comment
# Read-only PRAW client. client_id / client_secret are blanked here —
# fill in your Reddit app credentials before running.
reddit = praw.Reddit(
    client_id="",
    client_secret="",
    user_agent="Comment Extraction",
)
def getAllCommentsFromHotPosts(sub, postlimit=20, commentlimit=5, fullComment=False):
    """Collect comments from the hot posts of a subreddit.

    Parameters
    ----------
    sub : str
        Subreddit name, without the "/r/" prefix.
    postlimit : int
        Maximum number of hot submissions to scan.
    commentlimit : int
        Passed to ``replace_more`` — how many "MoreComments" stubs to expand.
    fullComment : bool
        If True return praw Comment objects (keeps metadata such as
        created_utc); otherwise return only the body strings.

    Returns
    -------
    list
        Comments (objects or bodies) flattened across all submissions.
    """
    subreddit = reddit.subreddit(sub)
    out = []
    # Materialize once so tqdm knows the total; iterate submissions
    # directly instead of the index-based range(len(...)) anti-pattern.
    for submission in tqdm(list(subreddit.hot(limit=postlimit))):
        submission.comments.replace_more(limit=commentlimit)
        for comment in submission.comments.list():
            out.append(comment if fullComment else comment.body)
    return out
# Pull full Comment objects from the top 20 hot /r/wallstreetbets posts;
# fullComment=True keeps metadata (created_utc) needed later.
wsbComments = getAllCommentsFromHotPosts("wallstreetbets",fullComment=True)
len(wsbComments)
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:44<00:00, 2.20s/it]
6876
def getAllBodies(comments):
    """Return the body text of every comment, in order."""
    return [comment.body for comment in comments]
def getAllTimes(comments):
    """Return the created_utc timestamp of every comment, in order."""
    return [comment.created_utc for comment in comments]
def setAllBodies(comments, newBodies):
    """Overwrite each comment's body in place.

    Empty replacement strings are skipped, so a comment whose cleaned
    text came back blank keeps its original body.
    """
    assert len(comments) == len(newBodies)
    for comment, replacement in zip(comments, newBodies):
        if replacement:
            comment.body = replacement
def filterEmptyBodies(comments):
    """Keep only the comments whose body is non-empty."""
    return [comment for comment in comments if comment.body]
def preprocess(data, skipEmpty=True):
    """Clean raw comment bodies for keyword matching and word counting.

    Parameters
    ----------
    data : iterable of str
        Raw comment texts.
    skipEmpty : bool
        If True, drop comments that clean down to the empty string; if
        False keep them (needed when output must stay index-aligned
        with the input, e.g. for setAllBodies).

    Returns
    -------
    list of str
        Lower-cased, whitespace-normalized texts.
    """
    newdata = []
    for text in data:
        # Blank out moderation artifacts entirely.
        if re.search(r'\*\*User Report\*\*|\[removed\]|\[deleted\]', text):
            text = ""
        # Strip URLs and inline image markup.
        text = re.sub(r'http\S+', ' ', text)
        text = re.sub(r'!\[img\]\S+', ' ', text)
        # Drop square brackets (markdown link remnants).
        text = re.sub(r'[\[\]]', ' ', text)
        # BUG FIX: a single str.replace('  ', ' ') left residue for runs
        # of 3+ spaces; collapse every whitespace run (spaces, newlines,
        # tabs) to one space in a single pass.
        text = re.sub(r'\s+', ' ', text).strip().lower()
        if skipEmpty and not text:
            continue
        newdata.append(text)
    return newdata
# Clean every WSB comment body and write the result back onto the
# Comment objects. skipEmpty=False keeps the list index-aligned with
# wsbComments, which setAllBodies asserts.
rawText = getAllBodies(wsbComments)
procText = preprocess(rawText,skipEmpty=False)
setAllBodies(wsbComments,procText)
# Load the SEC company_tickers.json mapping (keyed by rank) and keep the
# first 1500 rows; lower-case everything to match the lower-cased comments.
tickers = pd.read_json('company_tickers.json', orient='index')[["ticker","title"]].iloc[:1500,:]
tickers["ticker"] = tickers["ticker"].str.lower()
tickers["title"] = tickers["title"].str.lower()
# First word of the company title with ',' and '.' removed.
# NOTE(review): str.replace('.', '') relies on pandas >= 2.0 where
# regex=False is the default; on older pandas '.' is a regex matching
# any char and would blank the column — confirm the pandas version.
tickers["title_first"] = tickers["title"].str.split().str.get(0).str.replace(',', '').str.replace('.', '')
# Background corpus: comment bodies from /r/all hot posts, used to
# estimate how common each candidate keyword is in ordinary Reddit talk.
raw_all = getAllCommentsFromHotPosts("all",postlimit=40)
data_all = preprocess(raw_all)
len(data_all)
100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [01:16<00:00, 1.91s/it]
12341
def wordFreqs(data):
    """Return a Counter mapping each word to its log relative frequency.

    Words are split on single spaces and stripped of '.', '!', '?'
    before counting. Values are log(count) - log(total words), so
    looking up a word never seen returns 0 (Counter's default).
    """
    counts = Counter()
    total = 0
    strip_punct = str.maketrans('', '', '.!?')
    for text in data:
        tokens = text.split(' ')
        total += len(tokens)
        for token in tokens:
            counts[token.translate(strip_punct)] += 1
    log_total = np.log(total)
    for word in counts:
        counts[word] = np.log(counts[word]) - log_total
    return counts
# Log relative frequency of each word in the /r/all background corpus.
backg_freqs = wordFreqs(data_all)
# Build keyword -> ticker lookup. A keyword (ticker symbol or first word
# of the company name) qualifies only when it is effectively absent from
# ordinary Reddit chatter: Counter returns 0 for unseen words, and
# thresh is a log relative frequency (exp(-12.1) ~ 5.5e-6). This filters
# out tickers/company names that double as common English words.
keywordToTicker = {}
for idx,row in tickers.iterrows():
    ticker = row["ticker"]
    title = row["title_first"]
    thresh = -12.1
    if backg_freqs[ticker]==0 or backg_freqs[ticker] < thresh:
        keywordToTicker[ticker] = ticker
    if backg_freqs[title]==0 or backg_freqs[title] < thresh:
        keywordToTicker[title] = ticker
def bucketComments(comments, keyDict):
    """Group comments by the ticker keywords their bodies mention.

    Each comment is added at most once per ticker, even when several
    keywords in it map to the same ticker.
    """
    buckets = {}
    for comment in comments:
        seen = set()
        for token in comment.body.split(' '):
            ticker = keyDict.get(token)
            if ticker is not None and ticker not in seen:
                buckets.setdefault(ticker, []).append(comment)
                seen.add(ticker)
    return buckets
buckets = bucketComments(wsbComments,keywordToTicker)
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
def scoreTexts(texts, model, tokenizer, config):
    """Classify a batch of texts with a sequence-classification model.

    Parameters
    ----------
    texts : list of str
        Comment bodies to score.
    model, tokenizer, config
        transformers objects for one checkpoint; config.id2label maps
        class index -> label name.

    Returns
    -------
    (numpy.ndarray, list of str)
        Softmax class probabilities, shape (len(texts), n_labels), and
        the highest-probability label for each text.
    """
    encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    output = model(**encoded_input)
    # output[0] is the logits tensor; the original's [:] slice was a no-op.
    logits = output[0].detach().numpy()
    scores = softmax(logits, axis=1)
    # FIX: argsort(...)[:, -1] to find the max is indirect and has
    # unspecified tie order; argmax is direct and tie-stable.
    maxids = np.argmax(scores, axis=1)
    labels = [config.id2label[i] for i in maxids]
    return scores, labels
def analyzeBuckets(buckets):
    """Score every bucketed comment with the Cardiff RoBERTa sentiment model.

    Parameters
    ----------
    buckets : dict
        ticker -> list of praw Comment objects (see bucketComments).

    Returns
    -------
    pandas.DataFrame
        One row per comment: body, created timestamp, ticker, the three
        class probabilities, the argmax sentiment label, and 'pos_norm'
        — a z-scored positivity score in which neutral probability mass
        counts half as much as positive mass.
    """
    # FIX: was a pointless f-string (no placeholders).
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    dfs = []
    for ticker, comments in buckets.items():
        bodies = getAllBodies(comments)
        scores, labels = scoreTexts(bodies, model, tokenizer, config)
        df = pd.DataFrame(scores, columns=["negative", "neutral", "positive"])
        df["sentiment"] = labels
        df["ticker"] = ticker
        df["comment"] = bodies
        # created_utc is epoch seconds.
        df["created"] = pd.to_datetime(getAllTimes(comments), unit='s')
        dfs.append(df)
    out = pd.concat(dfs)[["comment", "created", "ticker", "negative", "neutral", "positive", "sentiment"]].reset_index(drop=True)
    # Positivity score z-normalized across ALL comments in ALL buckets,
    # so pos_norm is comparable between tickers.
    pos_adj = out['positive'] + (0.5 * out['neutral'])
    out['pos_norm'] = (pos_adj - np.mean(pos_adj)) / np.std(pos_adj)
    return out
analyzed = analyzeBuckets(buckets)
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight'] - This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
analyzed
| comment | created | ticker | negative | neutral | positive | sentiment | pos_norm | |
|---|---|---|---|---|---|---|---|---|
| 0 | the year is 2100...we have ai powered robots e... | 2024-12-06 16:14:41 | sbux | 0.573358 | 0.385036 | 0.041605 | negative | -0.656183 |
| 1 | a starbucks grande mocha costs $205,000 trump ... | 2024-12-06 21:35:26 | sbux | 0.279288 | 0.629626 | 0.091086 | neutral | -0.012359 |
| 2 | i did that with my last $50 left in my account... | 2024-12-09 21:38:22 | sbux | 0.142672 | 0.389270 | 0.468058 | positive | 0.950121 |
| 3 | my bad. this is a great write up. do you have ... | 2024-12-09 19:36:56 | sbux | 0.652351 | 0.198814 | 0.148835 | negative | -0.603266 |
| 4 | why is sbux a bad company? they seem great, an... | 2024-12-09 23:00:38 | sbux | 0.119699 | 0.235666 | 0.644635 | positive | 1.324084 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1158 | >!i am so poor, i switched from at&t postpaid ... | 2024-12-09 22:55:47 | t | 0.902450 | 0.085595 | 0.011955 | negative | -1.328475 |
| 1159 | i'm all in on tsla and rbrk right now. it's a ... | 2024-12-09 21:12:10 | rbrk | 0.003167 | 0.023927 | 0.972906 | positive | 2.157657 |
| 1160 | shit i’m full port alab till 250b market cap | 2024-12-09 22:46:22 | alab | 0.809970 | 0.168262 | 0.021768 | negative | -1.136777 |
| 1161 | 4bagger on khc calls today. it’s been a minute... | 2024-12-09 22:52:29 | khc | 0.067815 | 0.842997 | 0.089187 | neutral | 0.380389 |
| 1162 | coinbase down and won't let me sell, puts on coin | 2024-12-10 03:33:29 | coin | 0.819851 | 0.170869 | 0.009280 | negative | -1.178695 |
1163 rows × 8 columns
import plotly
import plotly.express as px
# Needed for plotly figures to render inline in a classic Jupyter notebook.
plotly.offline.init_notebook_mode()
# Per-ticker comment count and mean normalized sentiment.
pos_norm_summary = analyzed.groupby('ticker').agg(
    n=('pos_norm','count'),
    avg_pos_norm_score=('pos_norm', 'mean')
).reset_index()
# Treemap: tile area = number of comments mentioning the ticker,
# color = average normalized sentiment (red negative, green positive,
# white centered on the corpus mean).
fig = px.treemap(
    pos_norm_summary,
    path=['ticker'],
    values='n',
    color='avg_pos_norm_score',
    color_continuous_scale=["red","white","green"],
    color_continuous_midpoint=0,
    title="Normalized sentiment of stocks from /r/wallstreetbets comments"
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()
# Positive-labelled comments only: count and mean positive probability
# per ticker.
pos_summary = analyzed[analyzed['sentiment']=='positive'].groupby('ticker').agg(
    n_positive=('positive','count'),
    avg_pos_score=('positive', 'mean')
).reset_index()
tot_pos = np.sum(analyzed['sentiment']=='positive')
# Treemap: tile area = positive-comment count, color = mean positive score.
fig = px.treemap(
    pos_summary,
    path=['ticker'],
    values='n_positive',
    color='avg_pos_score',
    color_continuous_scale='Greens',
    title=f'Total positive comments = {tot_pos}'
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()
# Negative-labelled comments only: count and mean negative probability
# per ticker — mirrors the positive treemap above it in the notebook.
neg_summary = analyzed[analyzed['sentiment']=='negative'].groupby('ticker').agg(
    n_negative=('negative','count'),
    avg_neg_score=('negative', 'mean')
).reset_index()
tot_neg = np.sum(analyzed['sentiment']=='negative')
# Treemap: tile area = negative-comment count, color = mean negative score.
fig = px.treemap(
    neg_summary,
    path=['ticker'],
    values='n_negative',
    color='avg_neg_score',
    color_continuous_scale='Reds',
    title=f'Total negative comments = {tot_neg}'
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()